
import jieba
from collections import Counter
import pandas as pd
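
# Segment Weibo post text from Excel files with jieba, count word
# frequencies (excluding stop words and punctuation), and export the
# 1000 most common words to a CSV file.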


def save_seg(filename, cnt):
    # Write the 1000 most common words and their counts as CSV rows.
    with open(filename, 'w', encoding='utf-8') as f_out:
        f_out.write("单词" + "," + "数量" + "\n")  # header: word, count
        for word, count in cnt.most_common(1000):
            f_out.write(word + "," + str(count) + "\n")
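
# Example usage with a hypothetical counter, for illustration:
#   save_seg('../data/out.csv', Counter({'微博': 3, '热搜': 1}))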


# Words and punctuation to exclude from the counts; sets give O(1) membership tests.
STOPWORDS = {'的', ' ', '\n', '他', '地', '得', '而', '了', '在', '是', '我', '有', '和', '就', '不', '@', '都',
             '啊', '呀', '吧', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '吗', ''}
PUNCTUATIONS = {'~', ':', '。', '#', '，', '“', '”', '…', '？', '！', '、', '；', '（', '）', '...', '##', '【', '】', '/', '：'}
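
# The stop words could also be loaded from a file rather than hard-coded.
# A minimal sketch, assuming a UTF-8 'stopwords.txt' with one word per line:
#   with open('stopwords.txt', encoding='utf-8') as f:
#       STOPWORDS = {line.strip() for line in f}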

# Excel files (without extension) under ../data/ to segment
wj = ['test01']
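
# If domain terms (e.g. Weibo hashtags) get split incorrectly, a custom
# dictionary can be registered before cutting; '../data/userdict.txt' is a
# hypothetical path:
#   jieba.load_userdict('../data/userdict.txt')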

cnt = Counter()

for file in wj:
    data = pd.read_excel('../data/' + file + '.xls')
    # The '微博内容' (Weibo content) column holds the post text.
    for line in data['微博内容'].astype(str):
        for seg in jieba.cut(line):
            # Skip stop words, punctuation, and the source file names themselves.
            if seg not in STOPWORDS and seg not in PUNCTUATIONS and seg not in wj:
                cnt[seg] += 1

# Save the aggregated counts once all files have been processed.
save_seg('../data/test.csv', cnt)
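
# Quick sanity check, assuming the export above succeeded: read the CSV back
# and print the ten most frequent words.
df = pd.read_csv('../data/test.csv')
print(df.head(10))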
